In [55]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import xgboost as xgb

from scipy.stats import beta
from sklearn import preprocessing
In [2]:
pd.set_option('display.max_columns', None) # display all columns
pd.set_option('display.max_colwidth', -1)

Daywise search from October 2019 to March 2020

In [3]:
daywise_search = pd.read_csv('goa/daywise_search.csv', parse_dates=['searchDay'])
daywise_search.columns = ['searchDay','tid_count']
daywise_search = daywise_search.sort_values('searchDay')
fig = px.line(daywise_search, x='searchDay', y='tid_count', title='#unique tids each day')
fig.show()
In [4]:
startDate = pd.to_datetime('2019-12-01 00:00:00+05:30')
endDate = pd.to_datetime('2020-02-29 00:00:00+05:30')
print(startDate, endDate)
2019-12-01 00:00:00+05:30 2020-02-29 00:00:00+05:30
In [5]:
def dropNullColumns(df):
    return df.dropna(axis=1, how='all')
In [6]:
def dropColumnsWithSameValue(df):
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    return df.drop(columns=cols_to_drop)
In [7]:
def memory_usage(df):
    return(round(df.memory_usage(deep=True).sum() / 1024 ** 2, 2))

UGC data

redash query

  • questioncount: # of questions asked on platform
  • answercount: unverified data (140 answers for 1 question)
  • hotelreplycount: # of hotel replies on user reviews (unusable without review count)
  • approvedimagecount: can not measure quality based on this metric. pic can be of a torn bedsheet and approved
  • ta data not available in goa in redshift table
In [8]:
ugc_data = pd.read_csv('goa/ugc_data.csv', dtype={'localityid':'str','voyagerid':'str'})
location_data = ugc_data[['voyagerid','localityid','localityname']]
ugc_data.drop(columns=['localityid','localityname'], inplace=True)

ugc_data = dropNullColumns(ugc_data)
ugc_data = dropColumnsWithSameValue(ugc_data)
ugc_data
Out[8]:
voyagerid questioncount answercount hotelreplycount approvedimagecount reviewcount filteredreviewcounts_negative filteredreviewcounts_positive recommendation_val amenitiesrating amenitiesratingnew cleanlinessrating cleanlinessratingnew fdrating fdratingnew locrating locratingnew sqrating sqratingnew vfmrating detailedrating starrating hotelrating recentwindowrating relativerating laundry_like_count laundry_status laundry_total_like_count indoor_like_count indoor_act_status indoor_total_like_count outdoor_act_like_count outdoor_act_status outdoor_act_total_like_count gym_like_count gym_status gym_total_like_count parking_like_count parking_status parking_total_like_count pool_like_count pool_status pool_total_like_count reception_like_count reception_status reception_total_like_count restaurant_like_count restaurant_status restaurant_total_like_count room_service_like_count room_service_status room_service_total_like_count spa_like_count spa_status spa_total_like_count travel_assist_like_count travel_assist_status travel_assist_total_like_count wifi_like_count wifi_status wifi_total_like_count
0 100010595278878649 1.0 0.0 0.0 6.0 1.0 0.0 1.0 NaN 4.0 NaN 4.0 NaN 4.0 NaN 4.0 NaN 4.0 NaN 4.0 NaN 1.0 4.0 NaN 0.00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1000927881786706505 0.0 0.0 2.0 0.0 5.0 0.0 5.0 100.0 5.0 NaN 5.0 NaN 4.0 NaN 4.8 NaN 4.7 NaN 4.8 4.56757 0.0 4.6 0.0 3897.80 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1010299432254190053 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN 0.0 NaN 0.0 NaN 0.0 NaN 0.0 NaN 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1017893333879742102 2.0 7.0 1.0 9.0 1.0 NaN 1.0 NaN 4.6 NaN 4.6 NaN 4.0 NaN 4.6 NaN 4.6 NaN 5.0 NaN 0.0 4.6 NaN 0.00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1018521387794100934 1.0 0.0 2.0 3.0 3.0 1.0 2.0 NaN 3.4 NaN 1.3 NaN 1.0 NaN 3.4 NaN 3.6 NaN 3.8 NaN 0.0 3.6 NaN 611.31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7427 7035415141014666063 25.0 51.0 85.0 173.0 198.0 24.0 174.0 87.0 3.8 3.16 4.0 3.80 4.0 2.16 3.8 3.84 3.8 NaN 3.8 3.97978 0.0 4.0 3.8 5536.97 0.0 MAY BE 0.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 1.0 MAY BE 1.0 7.0 MAY BE 11.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 0.0 MAY BE 0.0 18.0 MAY BE 21.0
7428 7744131465824944941 90.0 201.0 531.0 543.0 937.0 99.0 838.0 89.0 3.9 4.09 3.8 3.66 3.0 NaN 4.1 4.26 3.9 4.50 4.0 4.06414 3.0 4.1 3.7 8308.61 1.0 MAY BE 2.0 0.0 MAY BE 0.0 0.0 MAY BE 2.0 1.0 NO 2.0 0.0 MAY BE 0.0 0.0 MAY BE 1.0 3.0 MAY BE 3.0 6.0 MAY BE 6.0 3.0 YES 3.0 1.0 NO 2.0 4.0 MAY BE 4.0 27.0 MAY BE 31.0
7429 8037099227187196550 81.0 248.0 369.0 491.0 552.0 80.0 472.0 85.0 4.0 3.43 4.1 3.01 3.0 3.16 3.9 3.16 4.0 3.72 4.0 3.94716 3.0 3.9 2.1 5678.84 4.0 MAY BE 4.0 11.0 MAY BE 16.0 14.0 MAY BE 15.0 1.0 NO 1.0 11.0 YES 14.0 27.0 YES 30.0 17.0 YES 20.0 16.0 YES 20.0 15.0 YES 17.0 0.0 NO 0.0 3.0 MAY BE 7.0 18.0 YES 22.0
7430 8966190385348905090 18.0 35.0 113.0 144.0 175.0 34.0 141.0 80.0 3.4 3.88 3.9 3.32 3.0 3.60 4.0 4.37 3.6 NaN 3.7 3.83660 3.0 3.8 3.6 3975.14 0.0 MAY BE 1.0 1.0 NO 1.0 1.0 MAY BE 1.0 0.0 NO 0.0 4.0 MAY BE 4.0 0.0 MAY BE 0.0 3.0 MAY BE 4.0 12.0 YES 14.0 2.0 MAY BE 3.0 0.0 MAY BE 0.0 2.0 MAY BE 2.0 10.0 YES 10.0
7431 9156345353853789599 4.0 15.0 15.0 35.0 91.0 31.0 60.0 65.0 2.9 NaN 3.1 NaN 3.0 NaN 2.9 NaN 2.9 NaN 2.9 3.14926 0.0 3.1 2.4 1023.36 0.0 MAY BE 0.0 2.0 MAY BE 2.0 1.0 MAY BE 1.0 0.0 MAY BE 0.0 6.0 MAY BE 6.0 0.0 MAY BE 0.0 3.0 MAY BE 4.0 3.0 MAY BE 3.0 1.0 MAY BE 1.0 1.0 MAY BE 1.0 0.0 MAY BE 0.0 1.0 MAY BE 3.0

7432 rows × 61 columns

In [9]:
ugc_data_columns = ugc_data.count() / ugc_data.shape[0]
ugc_data_columns = ugc_data_columns[ugc_data_columns > 0.4].index.to_list()
ugc_data = ugc_data[ugc_data_columns]
ugc_data
Out[9]:
voyagerid questioncount answercount hotelreplycount approvedimagecount reviewcount filteredreviewcounts_negative filteredreviewcounts_positive recommendation_val amenitiesrating cleanlinessrating fdrating locrating sqrating vfmrating detailedrating starrating hotelrating recentwindowrating relativerating
0 100010595278878649 1.0 0.0 0.0 6.0 1.0 0.0 1.0 NaN 4.0 4.0 4.0 4.0 4.0 4.0 NaN 1.0 4.0 NaN 0.00
1 1000927881786706505 0.0 0.0 2.0 0.0 5.0 0.0 5.0 100.0 5.0 5.0 4.0 4.8 4.7 4.8 4.56757 0.0 4.6 0.0 3897.80
2 1010299432254190053 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN NaN
3 1017893333879742102 2.0 7.0 1.0 9.0 1.0 NaN 1.0 NaN 4.6 4.6 4.0 4.6 4.6 5.0 NaN 0.0 4.6 NaN 0.00
4 1018521387794100934 1.0 0.0 2.0 3.0 3.0 1.0 2.0 NaN 3.4 1.3 1.0 3.4 3.6 3.8 NaN 0.0 3.6 NaN 611.31
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7427 7035415141014666063 25.0 51.0 85.0 173.0 198.0 24.0 174.0 87.0 3.8 4.0 4.0 3.8 3.8 3.8 3.97978 0.0 4.0 3.8 5536.97
7428 7744131465824944941 90.0 201.0 531.0 543.0 937.0 99.0 838.0 89.0 3.9 3.8 3.0 4.1 3.9 4.0 4.06414 3.0 4.1 3.7 8308.61
7429 8037099227187196550 81.0 248.0 369.0 491.0 552.0 80.0 472.0 85.0 4.0 4.1 3.0 3.9 4.0 4.0 3.94716 3.0 3.9 2.1 5678.84
7430 8966190385348905090 18.0 35.0 113.0 144.0 175.0 34.0 141.0 80.0 3.4 3.9 3.0 4.0 3.6 3.7 3.83660 3.0 3.8 3.6 3975.14
7431 9156345353853789599 4.0 15.0 15.0 35.0 91.0 31.0 60.0 65.0 2.9 3.1 3.0 2.9 2.9 2.9 3.14926 0.0 3.1 2.4 1023.36

7432 rows × 20 columns

In [10]:
recentwindowrating_median = ugc_data['recentwindowrating'].replace(0., np.NaN).median()
relativerating_median = ugc_data['relativerating'].replace(0., np.NaN).median()
recommendation_val_median = ugc_data['recommendation_val'].replace(0., np.NaN).median()

ugc_data.loc[(ugc_data['recentwindowrating'] == 0.) | ugc_data['recentwindowrating'].isna(), 'recentwindowrating'] = recentwindowrating_median
ugc_data.loc[(ugc_data['relativerating'] == 0.) | ugc_data['relativerating'].isna(), 'relativerating'] = relativerating_median
ugc_data.loc[(ugc_data['recommendation_val'] == 0.) | ugc_data['recommendation_val'].isna(), 'recommendation_val'] = recommendation_val_median
ugc_data.loc[(ugc_data['reviewcount'] <= 0.) | ugc_data['reviewcount'].isna(), 'reviewcount'] = 1.
In [11]:
ugc_data.plot(x='reviewcount', y='detailedrating', kind='hexbin', sharex=False,
              xscale='log', cmap='YlGnBu', gridsize=8, mincnt=1,
              title="detailed rating by review count")
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c2b88ea90>
In [12]:
ugc_data.loc[ugc_data['detailedrating'].isna(), 'detailedrating'] = 0.
ugc_data['bayesian_rating'] = ((50*3.5) + (ugc_data['reviewcount']*ugc_data['detailedrating']))/(50+ugc_data['reviewcount'])
ugc_data
Out[12]:
voyagerid questioncount answercount hotelreplycount approvedimagecount reviewcount filteredreviewcounts_negative filteredreviewcounts_positive recommendation_val amenitiesrating cleanlinessrating fdrating locrating sqrating vfmrating detailedrating starrating hotelrating recentwindowrating relativerating bayesian_rating
0 100010595278878649 1.0 0.0 0.0 6.0 1.0 0.0 1.0 87.0 4.0 4.0 4.0 4.0 4.0 4.0 0.00000 1.0 4.0 3.9 2435.81 3.431373
1 1000927881786706505 0.0 0.0 2.0 0.0 5.0 0.0 5.0 100.0 5.0 5.0 4.0 4.8 4.7 4.8 4.56757 0.0 4.6 3.9 3897.80 3.597052
2 1010299432254190053 0.0 0.0 0.0 0.0 1.0 NaN NaN 87.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 3.9 2435.81 3.431373
3 1017893333879742102 2.0 7.0 1.0 9.0 1.0 NaN 1.0 87.0 4.6 4.6 4.0 4.6 4.6 5.0 0.00000 0.0 4.6 3.9 2435.81 3.431373
4 1018521387794100934 1.0 0.0 2.0 3.0 3.0 1.0 2.0 87.0 3.4 1.3 1.0 3.4 3.6 3.8 0.00000 0.0 3.6 3.9 611.31 3.301887
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7427 7035415141014666063 25.0 51.0 85.0 173.0 198.0 24.0 174.0 87.0 3.8 4.0 4.0 3.8 3.8 3.8 3.97978 0.0 4.0 3.8 5536.97 3.883050
7428 7744131465824944941 90.0 201.0 531.0 543.0 937.0 99.0 838.0 89.0 3.9 3.8 3.0 4.1 3.9 4.0 4.06414 3.0 4.1 3.7 8308.61 4.035561
7429 8037099227187196550 81.0 248.0 369.0 491.0 552.0 80.0 472.0 85.0 4.0 4.1 3.0 3.9 4.0 4.0 3.94716 3.0 3.9 2.1 5678.84 3.910020
7430 8966190385348905090 18.0 35.0 113.0 144.0 175.0 34.0 141.0 80.0 3.4 3.9 3.0 4.0 3.6 3.7 3.83660 3.0 3.8 3.6 3975.14 3.761800
7431 9156345353853789599 4.0 15.0 15.0 35.0 91.0 31.0 60.0 65.0 2.9 3.1 3.0 2.9 2.9 2.9 3.14926 0.0 3.1 2.4 1023.36 3.273636

7432 rows × 21 columns

In [13]:
for starRating in ugc_data['starrating'].unique():
    if pd.notna(starRating):
        ugc_data['{}star'.format(int(starRating))] = ugc_data['starrating'] == starRating
ugc_data.drop(columns=['amenitiesrating','cleanlinessrating','fdrating','locrating','sqrating','vfmrating',
                       'questioncount','answercount','hotelreplycount','approvedimagecount','reviewcount','starrating',
                       'filteredreviewcounts_negative','filteredreviewcounts_positive','hotelrating','detailedrating'],
              inplace=True)
ugc_data.set_index('voyagerid', inplace=True)
ugc_data
Out[13]:
recommendation_val recentwindowrating relativerating bayesian_rating 1star 0star 3star 2star 4star 5star
voyagerid
100010595278878649 87.0 3.9 2435.81 3.431373 True False False False False False
1000927881786706505 100.0 3.9 3897.80 3.597052 False True False False False False
1010299432254190053 87.0 3.9 2435.81 3.431373 False True False False False False
1017893333879742102 87.0 3.9 2435.81 3.431373 False True False False False False
1018521387794100934 87.0 3.9 611.31 3.301887 False True False False False False
... ... ... ... ... ... ... ... ... ... ...
7035415141014666063 87.0 3.8 5536.97 3.883050 False True False False False False
7744131465824944941 89.0 3.7 8308.61 4.035561 False False True False False False
8037099227187196550 85.0 2.1 5678.84 3.910020 False False True False False False
8966190385348905090 80.0 3.6 3975.14 3.761800 False False True False False False
9156345353853789599 65.0 2.4 1023.36 3.273636 False True False False False False

7432 rows × 10 columns

Voyager meta data

Data is collected in augur db from voyager api

In [14]:
voyager_meta_data = pd.read_csv('goa/htl_meta_data.csv',
                                dtype={'hotel_voyager_id':'object', 'city_id':'object'},
                                parse_dates=['created_date','updated_time'])

hotelTypes = set(voyager_meta_data.hotel_list_type[~voyager_meta_data.hotel_list_type.isnull()].str.cat(sep=',').split(','))
for hotelType in hotelTypes:
    voyager_meta_data[hotelType] = voyager_meta_data['hotel_list_type'].str.contains(hotelType)
    
for propertyType in voyager_meta_data.property_type.unique():
    if (pd.notna(propertyType)):
        voyager_meta_data[propertyType.lower()] = voyager_meta_data['property_type'] == propertyType

voyager_meta_data = dropNullColumns(voyager_meta_data)

location_data = voyager_meta_data[['hotel_voyager_id','hotel_name','longitude','latitude']].merge(location_data, how='outer',
                                                                           left_on='hotel_voyager_id', 
                                                                           right_on='voyagerid')

# add oyo
voyager_meta_data['oyo'] = voyager_meta_data['chain_code'].str.startswith('oyo').fillna(False)

# ignoring chains since there are 74 chains
voyager_meta_data.drop(columns=['city_name','hotel_name','chain_code','is_chain','longitude','latitude','is_ingo',
                                'updated_time','is_active','property_type','focus_mapping','is_fraud',
                                'created_date','hotel_list_type','col_segment'], inplace=True)

voyager_meta_data.fillna(False, inplace=True)
voyager_meta_data = dropColumnsWithSameValue(voyager_meta_data)

voyager_meta_data.set_index('hotel_voyager_id', inplace=True)
voyager_meta_data
Out[14]:
gohomes go_stay alt_acco goibibo_main cottage service_apartment villa hotel homestay guest_house aparthotel resort tent hostel bnb beach_hut holiday_homes farm_stay bungalow lodge houseboat oyo
hotel_voyager_id
100010595278878649 False False False False True False False False False False False False False False False False False False False False False False
1000927881786706505 False False False False False False False False False False False False False False False False False False False False False False
100095815031859733 True False True True False True False False False False False False False False False False False False False False False False
1001896180279830144 True False True True False True False False False False False False False False False False False False False False False False
1002507148357869567 True False True True False False True False False False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
997673724526086981 True False True True False False False False True False False False False False False False False False False False False False
997757958227252856 True False True True False False False False True False False False False False False False False False False False False False
998698450105556216 True False True True False False True False False False False False False False False False False False False False False False
999270414876195337 False False False False False False False False False False False False False False False False False False False False False False
999539595327370963 False False False True False False False True False False False False False False False False False False False False False True

9235 rows × 22 columns

Location data

Extracted from voyager data (from augur db) and ugc query (from redshift)

In [15]:
location_data['vhid'] = location_data.hotel_voyager_id.combine_first(location_data.voyagerid)
location_data = location_data[['vhid','hotel_name','localityname','localityid','latitude','longitude']]
location_data
# locality features - which are popular
Out[15]:
vhid hotel_name localityname localityid latitude longitude
0 100010595278878649 Sea View Rooms Vedika Wooden Cottages Alto De Porvorim 8995661238375677842 15.509267 73.822708
1 1000927881786706505 Studio Apartment Goa Arpora - North 8760168679231251755 15.567881 73.760761
2 100095815031859733 2bhk woodstay apartment NaN NaN 15.513779 73.768936
3 1001896180279830144 Riva Boutique Apartments Bardez 4719923507833303064 15.537896 73.819664
4 1002507148357869567 SaffronStays Nossa Bela Casa NaN NaN 15.352792 73.899330
... ... ... ... ... ... ...
10205 8341503309335230493 NaN Candolim Area 2610008089602176103 NaN NaN
10206 8843928944697323059 NaN Calangute Area 4395858004770232814 NaN NaN
10207 8916134427622002479 NaN NaN NaN NaN NaN
10208 4312879489243262818 NaN Varca Area 5635648677645910983 NaN NaN
10209 1843002780602584011 NaN Baga Area 3575894941634872361 NaN NaN

10210 rows × 6 columns

In [16]:
px.set_mapbox_access_token(open(".mapbox_token").read())
fig = px.scatter_mapbox(location_data[location_data['localityid'].isin(location_data.groupby('localityid').agg({'vhid':'count'}).sort_values('vhid',ascending=False).head(20).index.to_list())], 
                        lat="latitude", lon="longitude", color="localityname", text='hotel_name')
fig.show()

NPS data

redash query

In [17]:
nps_data = pd.read_csv('goa/nps_data.csv', dtype={'voyagerid':'object'})
nps_data = nps_data.groupby('voyagerid').agg({'promoters':'sum','nutral':'sum','detractors':'sum'}).reset_index()
nps_data['nps_count'] = (nps_data['promoters'] + nps_data['nutral'] + nps_data['detractors'])
nps_data['nps_score'] = (nps_data['promoters'] - nps_data['detractors']) / nps_data['nps_count']
nps_data['non_detractor'] = (nps_data['promoters'] + nps_data['nutral']) / nps_data['nps_count']
nps_data.sort_values('nps_count', ascending=False)
Out[17]:
voyagerid promoters nutral detractors nps_count nps_score non_detractor
258 2949253418177622573 99 23 14 136 0.625000 0.897059
704 6359680374160639334 65 32 15 112 0.446429 0.866071
95 1695099252554833475 58 19 16 93 0.451613 0.827957
131 1921032075991056700 58 11 11 80 0.587500 0.862500
1044 9215850595846281322 33 14 29 76 0.052632 0.618421
... ... ... ... ... ... ... ...
698 6282048734318286342 0 0 1 1 -1.000000 0.000000
265 2975674741608875787 0 1 0 1 0.000000 1.000000
691 6207242888830290539 0 0 1 1 -1.000000 0.000000
46 1344106421604609218 1 0 0 1 1.000000 1.000000
40 1315707514476566300 1 0 0 1 1.000000 1.000000

1056 rows × 7 columns

In [18]:
# nps data not enough
nps_data.plot(x='nps_count', y='nps_score', kind='hexbin', sharex=False,
              cmap='YlGnBu', gridsize=6, mincnt=1,
              title="nps score by count")
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c2b520ad0>
In [19]:
nps_data.plot(x='nps_count', y='non_detractor', kind='hexbin', sharex=False,
              cmap='YlGnBu', gridsize=6, mincnt=1,
              title="nps score by count")
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c2acfb090>
In [20]:
nps_data['bayesian_nps'] = ((10*0) + (nps_data['promoters'] - nps_data['detractors']))/(10+nps_data['nps_count'])
nps_data['bayesian_non_detractor'] = ((10*0.5) + (nps_data['promoters'] + nps_data['nutral']))/(10+nps_data['nps_count'])
nps_data = nps_data[['voyagerid','nps_count','nps_score','bayesian_nps','non_detractor','bayesian_non_detractor']]
In [21]:
nps_data = nps_data[['voyagerid','bayesian_nps']].set_index('voyagerid')
nps_data
Out[21]:
bayesian_nps
voyagerid
1016714172799714644 0.133333
102297862189597319 0.230769
1023509060586808527 0.304348
1031336366232351629 0.391304
1033189131752729495 -0.090909
... ...
944862494189169789 0.230769
946284132832868451 0.230769
981922768591636694 0.520000
990321239720252665 0.000000
996905128890517746 -0.142857

1056 rows × 1 columns

Static data merged

In [22]:
static_data = ugc_data.merge(voyager_meta_data, left_index=True, right_index=True, how='outer')
static_data = static_data.merge(nps_data, left_index=True, right_index=True, how='outer')

static_data['recommendation_val'] = static_data['recommendation_val'].fillna(recommendation_val_median)
static_data['recentwindowrating'] = static_data['recentwindowrating'].fillna(recentwindowrating_median)
static_data['relativerating'] = static_data['relativerating'].fillna(relativerating_median)
static_data['bayesian_rating'] = static_data['bayesian_rating'].fillna(3.5)
static_data['bayesian_nps'] = static_data['bayesian_nps'].fillna(0.)
static_data.fillna(False, inplace=True)

scaler = preprocessing.MinMaxScaler()
static_data = pd.DataFrame(scaler.fit_transform(static_data), index=static_data.index, columns=static_data.columns)
static_data = static_data.reset_index()
static_data
Out[22]:
index recommendation_val recentwindowrating relativerating bayesian_rating 1star 0star 3star 2star 4star 5star gohomes go_stay alt_acco goibibo_main cottage service_apartment villa hotel homestay guest_house aparthotel resort tent hostel bnb beach_hut holiday_homes farm_stay bungalow lodge houseboat oyo bayesian_nps
0 100010595278878649 0.845238 0.676471 0.148222 0.720183 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
1 1000927881786706505 1.000000 0.676471 0.237192 0.760533 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
2 100095815031859733 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
3 1001896180279830144 0.928571 0.764706 0.698521 0.931869 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
4 1002507148357869567 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10205 997757958227252856 0.761905 0.676471 0.075372 0.744187 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
10206 998698450105556216 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
10207 999270414876195337 0.845238 0.676471 0.148222 0.720183 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
10208 999308668836199749 0.845238 0.676471 0.148222 0.720183 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249
10209 999539595327370963 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249

10210 rows × 34 columns

Ingo content data

redash query

Check for contact detail feature for other cities. In this city, all hotels have contact details updated

In [23]:
ingo_content_data = pd.read_csv('goa/ingo_content_score.csv',
                                dtype={'hotelcode':'object','voyagerid':'object','cityid':'object'})
ingo_content_data['modifiedon'] = pd.to_datetime(ingo_content_data['modifiedon'], unit='ms')
ingo_content_data['modified_date'] = ingo_content_data['modifiedon'].dt.date
ingo_content_data = dropNullColumns(ingo_content_data)
ingo_content_data = dropColumnsWithSameValue(ingo_content_data)
ingo_content_data.fillna(0., inplace=True)
ingo_content_data.drop(columns=['hotelcode','hoteltype','totalcontentscore'], inplace=True)
ingo_content_data
Out[23]:
modifiedon voyagerid description hotelamenities roomamenities hotelimage roomimage roomdetails modified_date
0 2020-04-07 18:35:29.137 6710628460158842741 5.000 10.0 10.0 42.00 28.0 5.0 2020-04-07
1 2020-02-27 10:16:32.872 1768137749300482275 0.375 0.0 0.0 15.00 28.0 5.0 2020-02-27
2 2020-03-30 00:41:32.047 9016090624151231988 5.000 10.0 10.0 42.00 28.0 5.0 2020-03-30
3 2020-02-27 10:17:19.601 4636429859410864054 5.000 0.0 10.0 42.00 28.0 5.0 2020-02-27
4 2020-04-11 00:41:24.573 4890762389184698251 5.000 10.0 10.0 42.00 28.0 5.0 2020-04-11
... ... ... ... ... ... ... ... ... ...
76556 2019-01-28 15:55:11.000 2221713385180617355 5.000 10.0 10.0 24.00 0.0 5.0 2019-01-28
76557 2019-04-03 18:04:51.200 1683808161966154570 5.000 9.0 10.0 35.25 28.0 5.0 2019-04-03
76558 2020-01-02 11:15:07.261 2682542980143965756 0.000 10.0 10.0 0.00 9.9 5.0 2020-01-02
76559 2019-05-14 07:07:30.295 627708254182763984 0.000 0.0 0.0 0.00 15.0 0.0 2019-05-14
76560 2019-10-30 23:00:36.435 5602109079040906794 5.000 10.0 10.0 28.50 14.0 5.0 2019-10-30

76561 rows × 9 columns

In [24]:
ingo_content_data_deduped = ingo_content_data.groupby(['voyagerid','modified_date']).agg({'modifiedon':'max'})[2:].reset_index().drop(columns=['modified_date'])
ingo_content_data_deduped = ingo_content_data.merge(ingo_content_data_deduped, on=['voyagerid','modifiedon']).drop(columns=['modifiedon'])
ingo_content_data_deduped
Out[24]:
voyagerid description hotelamenities roomamenities hotelimage roomimage roomdetails modified_date
0 6710628460158842741 5.000 10.0 10.0 42.0 28.0 5.0 2020-04-07
1 1768137749300482275 0.375 0.0 0.0 15.0 28.0 5.0 2020-02-27
2 9016090624151231988 5.000 10.0 10.0 42.0 28.0 5.0 2020-03-30
3 4636429859410864054 5.000 0.0 10.0 42.0 28.0 5.0 2020-02-27
4 4890762389184698251 5.000 10.0 10.0 42.0 28.0 5.0 2020-04-11
... ... ... ... ... ... ... ... ...
59361 6761584605229267727 0.000 4.0 10.0 0.0 0.0 2.5 2020-03-24
59362 8871143215413788554 0.000 2.0 10.0 19.5 28.0 2.5 2019-12-19
59363 2533937237251174711 5.000 8.0 0.0 0.0 0.0 0.0 2019-12-19
59364 2682542980143965756 0.000 10.0 10.0 0.0 0.0 5.0 2020-01-02
59365 627708254182763984 0.000 0.0 0.0 0.0 15.0 0.0 2019-05-14

59366 rows × 8 columns

In [25]:
ingo_content_data_init = ingo_content_data_deduped[ingo_content_data_deduped['modified_date']<=pd.to_datetime('2020-01-01')].groupby('voyagerid').agg({'modified_date':'max'}).reset_index()
ingo_content_data_init = ingo_content_data_deduped.merge(ingo_content_data_init, on=['voyagerid','modified_date'])
ingo_content_data_init['modified_date'] = pd.to_datetime('2020-01-01').date()
ingo_content_data_init
Out[25]:
voyagerid description hotelamenities roomamenities hotelimage roomimage roomdetails modified_date
0 3645512695323823228 5.0 8.00000 10.0 42.00 28.0 5.0 2020-01-01
1 25953101383255308 5.0 8.66667 10.0 39.75 28.0 2.5 2020-01-01
2 5236912984033722314 5.0 0.00000 10.0 42.00 28.0 5.0 2020-01-01
3 4963878413800973493 5.0 0.00000 10.0 19.50 28.0 2.5 2020-01-01
4 5710820998712504158 5.0 9.33333 10.0 42.00 28.0 5.0 2020-01-01
... ... ... ... ... ... ... ... ...
11334 9001318939808335694 5.0 10.00000 0.0 42.00 15.0 0.0 2020-01-01
11335 4302009724015539768 5.0 4.00000 0.0 0.00 0.0 5.0 2020-01-01
11336 8106335883333444380 0.0 8.00000 10.0 0.00 0.0 0.0 2020-01-01
11337 2816578955835612713 5.0 10.00000 10.0 0.00 0.0 5.0 2020-01-01
11338 4546765944405292759 0.0 0.00000 10.0 0.00 15.0 0.0 2020-01-01

11339 rows × 8 columns

In [26]:
ingo_content_data_deduped = ingo_content_data_deduped[ingo_content_data_deduped['modified_date']>pd.to_datetime('2020-01-01')]
ingo_content_data = pd.concat([ingo_content_data_init, ingo_content_data_deduped], axis=0, ignore_index=True)
ingo_content_data['modified_date'] = pd.to_datetime(ingo_content_data['modified_date'])
ingo_content_data.rename(columns={'voyagerid':'vhid','modified_date':'dt'}, inplace=True)
ingo_content_data
Out[26]:
vhid description hotelamenities roomamenities hotelimage roomimage roomdetails dt
0 3645512695323823228 5.0 8.00000 10.0 42.00 28.0 5.0 2020-01-01
1 25953101383255308 5.0 8.66667 10.0 39.75 28.0 2.5 2020-01-01
2 5236912984033722314 5.0 0.00000 10.0 42.00 28.0 5.0 2020-01-01
3 4963878413800973493 5.0 0.00000 10.0 19.50 28.0 2.5 2020-01-01
4 5710820998712504158 5.0 9.33333 10.0 42.00 28.0 5.0 2020-01-01
... ... ... ... ... ... ... ... ...
25592 5762067864339756505 5.0 10.00000 10.0 42.00 28.0 5.0 2020-03-07
25593 8676657096963095637 5.0 10.00000 10.0 40.20 28.0 2.5 2020-03-03
25594 1175938368590215369 0.0 5.00000 10.0 0.00 28.0 2.5 2020-03-09
25595 6761584605229267727 0.0 4.00000 10.0 0.00 0.0 2.5 2020-03-24
25596 2682542980143965756 0.0 10.00000 10.0 0.00 0.0 5.0 2020-01-02

25597 rows × 8 columns

Transaction data

In [27]:
transactions = pd.read_csv('goa/transactions.csv', parse_dates=['bookingDate'], dtype={'vhid':'object'})
transactions.rename(columns={'transactions':'rn','bookingDate':'dt'}, inplace=True)
transactions
Out[27]:
dt vhid rn asp
0 2020-01-24 8127904297250823253 2 4652.000000
1 2020-02-14 3576136520949843682 7 1786.437744
2 2020-02-14 5865824439186584633 2 3922.750000
3 2020-02-18 2331396032248919252 3 1015.250000
4 2020-02-29 6289364780871957457 3 977.333313
... ... ... ... ...
23009 2020-01-04 7436089683105729674 4 6471.250000
23010 2020-01-21 5801163250522061014 3 5544.000000
23011 2020-01-25 5233079379265104048 1 50.000000
23012 2020-02-14 6814796127018891366 2 4273.779785
23013 2020-02-10 6110030067260149430 1 976.000000

23014 rows × 4 columns

In [28]:
mean_asp_by_vhid = transactions.groupby('vhid').agg({'asp':'mean'})
mean_asp_by_date = transactions.groupby('dt').agg({'asp':'mean'})

Daily funnel data

Bayesian inference to calculate CTR/CR

x = detail_hits in case of CTR | bookings in case of CR

N = impressions in case of CTR | detail_hits in case of CR

x_mean = mean value of x across all hotels in the city for the day

N_mean = mean value of N across all hotels in the city for the day

alpha_coefficient = x_mean

beta_coefficient = N_mean - x_mean

  • Prior belief: beta(alpha_coefficient, beta_coefficient)
  • Likelihood function: binomial function (since it is modelled as a bernoulli trial)
  • Posterior belief: (alpha_coefficient + x, beta_coefficient + N - x)

Borrowed from here

In [29]:
# daily_funnel_data = pd.read_csv('goa/daily_funnel_data.csv', parse_dates=['searchDate'], dtype={'searchVhid':'object'})
# daily_funnel_data.rename(columns={'searchDate':'dt','searchVhid':'vhid'}, inplace=True)
# daily_funnel_data = daily_funnel_data.replace(0., np.NaN)

# funnel_data_grouped = daily_funnel_data.groupby('dt').agg({'searches':'mean','detailClicks':'mean','bookings':'mean'})
# daily_funnel_data = daily_funnel_data.merge(funnel_data_grouped, how='left', left_on='dt', right_index=True).fillna(0.)

# # cr and ctr are calculated using bayesian inference
# daily_funnel_data['ctr'] = daily_funnel_data.apply(lambda d: beta.mean(d.detailClicks_y + d.detailClicks_x, d.searches_y - d.detailClicks_y + d.searches_x - d.detailClicks_x), axis=1)
# daily_funnel_data['cr'] = daily_funnel_data.apply(lambda d: beta.mean(d.bookings_y + d.bookings_x, d.detailClicks_y - d.bookings_y + d.detailClicks_x - d.bookings_x), axis=1)
# # daily_funnel_data = daily_funnel_data[['date','vhid','ctr','cr']]

# daily_funnel_data.rename(columns={'searches_x':'searches','detailClicks_x':'detailClicks','bookings_x':'bookings',
#                                   'searches_y':'searches_mean','detailClicks_y':'detailClicks_mean','bookings_y':'bookings_mean'}, inplace=True)
# daily_funnel_data.to_csv('goa/daily_funnel_data_beta.csv', index=False)

# daily_funnel_data
In [30]:
daily_funnel_data = pd.read_csv('goa/daily_funnel_data_beta.csv', parse_dates=['dt'], dtype={'vhid':'object'})
daily_funnel_data
Out[30]:
dt vhid searches detailClicks bookings searches_mean detailClicks_mean bookings_mean ctr cr
0 2020-01-04 4534496020989252375 1815 248.0 1.0 287.590172 29.684307 1.416309 0.132068 0.008702
1 2020-02-04 5800911411502315516 395 24.0 0.0 151.295090 27.801673 1.573034 0.094824 0.030366
2 2020-01-05 2969550203568856275 851 57.0 0.0 294.306338 30.765071 1.500000 0.076630 0.017091
3 2020-01-05 5770867326704776265 890 87.0 0.0 294.306338 30.765071 1.500000 0.099438 0.012737
4 2020-01-21 1646863333716315870 5164 813.0 0.0 160.008386 28.686632 1.521327 0.158093 0.001807
... ... ... ... ... ... ... ... ... ... ...
252746 2020-02-04 8236013734655448053 2 1.0 0.0 151.295090 27.801673 1.573034 0.187884 0.054616
252747 2020-01-08 5569211629788906692 1 0.0 0.0 182.809746 29.601695 1.497925 0.161045 0.050603
252748 2020-01-07 6717336120115810509 1 0.0 0.0 187.088777 29.428375 1.565611 0.156460 0.053201
252749 2020-01-08 2499454446305811421 2 0.0 0.0 182.809746 29.601695 1.497925 0.160174 0.050603
252750 2020-02-20 8224434293395554018 1 0.0 0.0 181.130403 28.433513 1.447581 0.156116 0.050911

252751 rows × 10 columns

In [31]:
quantiles = daily_funnel_data.groupby('dt').agg({'cr':[lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)],
                                     'ctr':[lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)]})
quantiles.columns = ['cr_q1', 'cr_q2', 'ctr_q1', 'ctr_q2']
quantiles['cr_iqr'] = quantiles['cr_q2'] - quantiles['cr_q1']
quantiles['ctr_iqr'] = quantiles['ctr_q2'] - quantiles['ctr_q1']
quantiles['cr_l'] = quantiles['cr_q1'] - 1.5*quantiles['cr_iqr']
quantiles['cr_u'] = quantiles['cr_q2'] + 1.5*quantiles['cr_iqr']
quantiles['ctr_l'] = quantiles['ctr_q1'] - 1.5*quantiles['ctr_iqr']
quantiles['ctr_u'] = quantiles['ctr_q2'] + 1.5*quantiles['ctr_iqr']
quantiles = quantiles[['cr_l','cr_u','ctr_l','ctr_u']]
quantiles.head()
Out[31]:
cr_l cr_u ctr_l ctr_u
dt
2020-01-01 0.032665 0.062088 0.030232 0.159765
2020-01-02 0.028246 0.054938 0.029770 0.134682
2020-01-03 0.028130 0.059479 0.025173 0.134506
2020-01-04 0.027656 0.059746 0.024569 0.125118
2020-01-05 0.028864 0.060692 0.026053 0.127354
In [32]:
daily_funnel_data = daily_funnel_data.merge(quantiles, on=['dt'])
daily_funnel_data.loc[daily_funnel_data['ctr'] < daily_funnel_data['ctr_l'], 'ctr'] = daily_funnel_data['ctr_l']
daily_funnel_data.loc[daily_funnel_data['ctr'] > daily_funnel_data['ctr_u'], 'ctr'] = daily_funnel_data['ctr_u']
daily_funnel_data.loc[daily_funnel_data['cr'] < daily_funnel_data['cr_l'], 'ctr'] = daily_funnel_data['cr_l']
daily_funnel_data.loc[daily_funnel_data['cr'] > daily_funnel_data['cr_u'], 'ctr'] = daily_funnel_data['cr_u']
daily_funnel_data
Out[32]:
dt vhid searches detailClicks bookings searches_mean detailClicks_mean bookings_mean ctr cr cr_l cr_u ctr_l ctr_u
0 2020-01-04 4534496020989252375 1815 248.0 1.0 287.590172 29.684307 1.416309 0.027656 0.008702 0.027656 0.059746 0.024569 0.125118
1 2020-01-04 7045690507975564635 514 12.0 0.0 287.590172 29.684307 1.416309 0.052002 0.033977 0.027656 0.059746 0.024569 0.125118
2 2020-01-04 822661676043348693 741 9.0 0.0 287.590172 29.684307 1.416309 0.037609 0.036612 0.027656 0.059746 0.024569 0.125118
3 2020-01-04 4797644311105828251 638 18.0 0.0 287.590172 29.684307 1.416309 0.051518 0.029702 0.027656 0.059746 0.024569 0.125118
4 2020-01-04 9141059828240372790 338 7.0 0.0 287.590172 29.684307 1.416309 0.058640 0.038608 0.027656 0.059746 0.024569 0.125118
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
252746 2020-01-31 1314584744655476113 32 0.0 0.0 116.252388 24.317703 1.510870 0.164029 0.062130 0.040190 0.075295 0.063958 0.267921
252747 2020-01-31 4377937175711091410 15 0.0 0.0 116.252388 24.317703 1.510870 0.185274 0.062130 0.040190 0.075295 0.063958 0.267921
252748 2020-01-31 4665667295838472450 2 0.0 0.0 116.252388 24.317703 1.510870 0.205642 0.062130 0.040190 0.075295 0.063958 0.267921
252749 2020-01-31 8040668970630857992 23 2.0 0.0 116.252388 24.317703 1.510870 0.188993 0.057409 0.040190 0.075295 0.063958 0.267921
252750 2020-01-31 5068931500364042956 5 0.0 0.0 116.252388 24.317703 1.510870 0.200554 0.062130 0.040190 0.075295 0.063958 0.267921

252751 rows × 14 columns

In [33]:
cr_ctr = daily_funnel_data[daily_funnel_data['dt'] == pd.to_datetime('2020-01-04')]

ctr = (cr_ctr['detailClicks']/cr_ctr['searches']).values
beta_ctr = cr_ctr.ctr.values

cr = (cr_ctr['bookings']/cr_ctr['detailClicks']).fillna(0.).values
beta_cr = cr_ctr.cr.values

fig = ff.create_distplot([ctr, beta_ctr, cr, beta_cr], ['ctr','beta_ctr', 'cr', 'beta_cr'])
fig.show()
In [34]:
daily_funnel_data = daily_funnel_data[['dt','vhid','cr','ctr']]
daily_funnel_data
Out[34]:
dt vhid cr ctr
0 2020-01-04 4534496020989252375 0.008702 0.027656
1 2020-01-04 7045690507975564635 0.033977 0.052002
2 2020-01-04 822661676043348693 0.036612 0.037609
3 2020-01-04 4797644311105828251 0.029702 0.051518
4 2020-01-04 9141059828240372790 0.038608 0.058640
... ... ... ... ...
252746 2020-01-31 1314584744655476113 0.062130 0.164029
252747 2020-01-31 4377937175711091410 0.062130 0.185274
252748 2020-01-31 4665667295838472450 0.062130 0.205642
252749 2020-01-31 8040668970630857992 0.057409 0.188993
252750 2020-01-31 5068931500364042956 0.062130 0.200554

252751 rows × 4 columns

Daily data merged

In [35]:
def weighted_sum(values, historical_weight=0.66, normalize_zero=True):
    result = values.copy()
    for index, curr in enumerate(values[1:]):
        prev = result[index]
        if normalize_zero:
            result[index+1] = (historical_weight * prev) + ((1 - historical_weight) * curr)
        else:
            result[index+1] = prev if curr == 0 else (historical_weight * prev) + ((1 - historical_weight) * curr)
    return result
In [36]:
session_start = pd.to_datetime('2020-01-01')
session_end = pd.to_datetime('2020-02-29')
def historical_weighted(df):
    vhid = df.vhid.unique()[0]
    df.drop(columns='vhid', inplace=True)
    df.set_index('dt',inplace=True)
    t_index = pd.date_range(start=session_start, end=session_end, freq='D', closed='left')
    df = df.reindex(t_index).fillna(0.)
    for col in df.columns:
        if col == 'rn':
            df[col] = df[col].cumsum()
        elif col in ['cr','ctr']:
            df[col] = weighted_sum(df[col])
        elif col in ['asp']:
            df[col] = weighted_sum(df[col], normalize_zero=False)
        else:
            df[col] = df[col]
    return df
In [37]:
daily_data = daily_funnel_data.merge(transactions, on=['vhid','dt'], how='outer')

daily_data['rn'] = daily_data['rn'].fillna(0.)

daily_data = daily_data.merge(mean_asp_by_vhid, on=['vhid'], how='left')
daily_data.loc[daily_data['asp_x'].isna(), 'asp_x'] = daily_data['asp_y']
daily_data = daily_data.merge(mean_asp_by_date, on=['dt'], how='left')
daily_data.loc[daily_data['asp_x'].isna(), 'asp_x'] = daily_data['asp']
daily_data = daily_data.drop(columns=['asp_y','asp']).rename(columns={'asp_x':'asp'})

daily_data = daily_data.groupby('vhid').apply(historical_weighted)
daily_data = daily_data.reset_index().rename(columns={'level_1':'dt'})

daily_data = daily_data.merge(ingo_content_data, on='vhid', how='outer')
daily_data = daily_data[daily_data['dt_y'] < daily_data['dt_x']]
daily_data['t'] = (daily_data['dt_x'] - daily_data['dt_y'])
daily_data['rank'] = daily_data.groupby('vhid')['t'].rank(method='min')
min_rank = daily_data.groupby(['vhid','dt_x']).agg({'rank':'min'}).reset_index()
daily_data = daily_data.merge(min_rank, on=['vhid','dt_x'], how='left')
daily_data = daily_data[daily_data['rank_x'] == daily_data['rank_y']]
daily_data = daily_data.drop(columns=['dt_y','t','rank_x','rank_y']).rename(columns={'dt_x':'dt'})

scaler = preprocessing.MinMaxScaler()
daily_data = daily_data.set_index(['vhid','dt'])
daily_data = pd.DataFrame(scaler.fit_transform(daily_data), index=daily_data.index, columns=daily_data.columns)
daily_data = daily_data.reset_index()
In [41]:
# scaler = preprocessing.MinMaxScaler()
# daily_data = daily_data.set_index(['vhid','dt'])
# daily_data = pd.DataFrame(scaler.fit_transform(daily_data), index=daily_data.index, columns=daily_data.columns)
# daily_data = daily_data.reset_index()
In [38]:
daily_data
Out[38]:
vhid dt cr ctr rn asp description hotelamenities roomamenities hotelimage roomimage roomdetails
0 100095815031859733 2020-01-02 5.145601e-01 4.261843e-01 0.0 0.083297 1.0 1.0 1.0 0.598214 1.0 1.0
1 100095815031859733 2020-01-03 5.101002e-01 4.034899e-01 0.0 0.081817 1.0 1.0 1.0 0.598214 1.0 1.0
2 100095815031859733 2020-01-04 5.071171e-01 3.802101e-01 0.0 0.079840 1.0 1.0 1.0 0.598214 1.0 1.0
3 100095815031859733 2020-01-05 5.088787e-01 3.721175e-01 0.0 0.078745 1.0 1.0 1.0 0.598214 1.0 1.0
4 100095815031859733 2020-01-06 5.181259e-01 3.798255e-01 0.0 0.079165 1.0 1.0 1.0 0.598214 1.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ...
273496 999539595327370963 2020-02-24 2.578606e-09 3.419796e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273497 999539595327370963 2020-02-25 1.701880e-09 2.257066e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273498 999539595327370963 2020-02-26 1.123241e-09 1.489663e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273499 999539595327370963 2020-02-27 7.413388e-10 9.831778e-10 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273500 999539595327370963 2020-02-28 4.892836e-10 6.488974e-10 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0

273501 rows × 12 columns

Merged static and daily data

In [47]:
merged = static_data.merge(daily_data, how='inner', left_on='index', right_on='vhid').drop(columns=['index'])
merged['vhid'] = merged['vhid'].astype('int64')
merged['dt'] = merged['dt'].dt.date
merged
Out[47]:
recommendation_val recentwindowrating relativerating bayesian_rating 1star 0star 3star 2star 4star 5star gohomes go_stay alt_acco goibibo_main cottage service_apartment villa hotel homestay guest_house aparthotel resort tent hostel bnb beach_hut holiday_homes farm_stay bungalow lodge houseboat oyo bayesian_nps vhid dt cr ctr rn asp description hotelamenities roomamenities hotelimage roomimage roomdetails
0 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249 100095815031859733 2020-01-02 5.145601e-01 4.261843e-01 0.0 0.083297 1.0 1.0 1.0 0.598214 1.0 1.0
1 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249 100095815031859733 2020-01-03 5.101002e-01 4.034899e-01 0.0 0.081817 1.0 1.0 1.0 0.598214 1.0 1.0
2 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249 100095815031859733 2020-01-04 5.071171e-01 3.802101e-01 0.0 0.079840 1.0 1.0 1.0 0.598214 1.0 1.0
3 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249 100095815031859733 2020-01-05 5.088787e-01 3.721175e-01 0.0 0.078745 1.0 1.0 1.0 0.598214 1.0 1.0
4 0.845238 0.676471 0.148222 0.736897 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.343249 100095815031859733 2020-01-06 5.181259e-01 3.798255e-01 0.0 0.079165 1.0 1.0 1.0 0.598214 1.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
273728 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 999539595327370963 2020-02-24 2.578606e-09 3.419796e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273729 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 999539595327370963 2020-02-25 1.701880e-09 2.257066e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273730 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 999539595327370963 2020-02-26 1.123241e-09 1.489663e-09 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273731 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 999539595327370963 2020-02-27 7.413388e-10 9.831778e-10 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0
273732 0.702381 0.676471 0.059704 0.732422 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 999539595327370963 2020-02-28 4.892836e-10 6.488974e-10 0.0 0.077916 1.0 1.0 0.0 1.000000 1.0 1.0

273733 rows × 45 columns

In [48]:
merged.dtypes
Out[48]:
recommendation_val    float64
recentwindowrating    float64
relativerating        float64
bayesian_rating       float64
1star                 float64
0star                 float64
3star                 float64
2star                 float64
4star                 float64
5star                 float64
gohomes               float64
go_stay               float64
alt_acco              float64
goibibo_main          float64
cottage               float64
service_apartment     float64
villa                 float64
hotel                 float64
homestay              float64
guest_house           float64
aparthotel            float64
resort                float64
tent                  float64
hostel                float64
bnb                   float64
beach_hut             float64
holiday_homes         float64
farm_stay             float64
bungalow              float64
lodge                 float64
houseboat             float64
oyo                   float64
bayesian_nps          float64
vhid                  int64  
dt                    object 
cr                    float64
ctr                   float64
rn                    float64
asp                   float64
description           float64
hotelamenities        float64
roomamenities         float64
hotelimage            float64
roomimage             float64
roomdetails           float64
dtype: object

Session data

In [51]:
session_data = pd.read_csv('goa/session_data.csv', parse_dates=['searchDate'])
In [52]:
feb_data = session_data[session_data['searchDate']>=pd.to_datetime('2020-02-01')]
feb_data['date'] = feb_data['searchDate'].dt.date - datetime.timedelta(days=1)
# feb_data.to_csv('goa/feb_data.csv')
feb_data.shape
Out[52]:
(18960397, 8)
In [49]:
feb_data.dtypes
Out[49]:
searchDate     datetime64[ns]
flavour        object        
los            int64         
ap             int64         
pax_bucket     object        
searchVhid     int64         
is_relevant    bool          
date           object        
dtype: object
In [43]:
feb_data = feb_data.drop(columns=['Unnamed: 0'])
In [53]:
data = feb_data.merge(merged, left_on=['searchVhid','date'], right_on=['vhid','dt'])
data.drop(columns=['searchVhid','date','dt'], inplace=True)
data.set_index(['vhid','searchDate'], inplace=True)
# data.to_csv('goa/feb_data_merged.csv')
data
Out[53]:
flavour los ap pax_bucket is_relevant recommendation_val recentwindowrating relativerating bayesian_rating 1star 0star 3star 2star 4star 5star gohomes go_stay alt_acco goibibo_main cottage service_apartment villa hotel homestay guest_house aparthotel resort tent hostel bnb beach_hut holiday_homes farm_stay bungalow lodge houseboat oyo bayesian_nps cr ctr rn asp description hotelamenities roomamenities hotelimage roomimage roomdetails
vhid searchDate
2829921501140013519 2020-02-23 ios 4 9 couple False 0.821429 0.500000 0.329077 0.830693 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.577803 0.031068 0.120766 0.139612 0.087898 1.000000 1.000000 1.0 1.000000 1.0 1.0
2020-02-23 ios 3 37 other False 0.821429 0.500000 0.329077 0.830693 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.577803 0.031068 0.120766 0.139612 0.087898 1.000000 1.000000 1.0 1.000000 1.0 1.0
2020-02-23 android 2 25 couple True 0.821429 0.500000 0.329077 0.830693 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.577803 0.031068 0.120766 0.139612 0.087898 1.000000 1.000000 1.0 1.000000 1.0 1.0
2020-02-23 android 1 7 couple True 0.821429 0.500000 0.329077 0.830693 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.577803 0.031068 0.120766 0.139612 0.087898 1.000000 1.000000 1.0 1.000000 1.0 1.0
2020-02-23 android 1 6 couple False 0.821429 0.500000 0.329077 0.830693 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.577803 0.031068 0.120766 0.139612 0.087898 1.000000 1.000000 1.0 1.000000 1.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6826681695567059184 2020-02-07 mobile 2 65 couple False 0.845238 0.588235 0.347321 0.833804 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 0.574104 0.692887 0.008864 0.033921 1.000000 1.000000 1.0 0.839286 1.0 1.0
8068090069848903843 2020-02-07 mobile 2 65 couple False 0.964286 0.823529 0.419176 0.838672 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 0.583190 0.818919 0.004986 0.031814 1.000000 1.000000 1.0 0.625000 1.0 1.0
2233338809512339501 2020-02-07 mobile 2 65 couple False 1.000000 0.676471 0.232446 0.757643 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 0.581640 0.741847 0.000000 0.077000 1.000000 1.000000 1.0 0.000000 0.0 1.0
3860612734906885589 2020-02-07 mobile 2 65 couple False 0.892857 0.558824 0.234825 0.782276 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.343249 0.582112 0.834231 0.000554 0.027746 0.016667 0.733333 1.0 1.000000 1.0 1.0
9101479698324692193 2020-02-07 mobile 2 65 couple False 0.845238 0.705882 0.303306 0.826706 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.530892 0.361324 0.549628 0.006094 0.063369 1.000000 0.000000 1.0 1.000000 1.0 1.0

18913041 rows × 48 columns

In [ ]:
nunique = data.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 2].index
d = data.loc[:, data.columns not in [cols_to_drop]]
fig, ax = plt.subplots(figsize=(15,8))
sns.heatmap(d.corr(),
            xticklabels=d.columns,
            yticklabels=d.columns)
In [ ]:
import seaborn as sns
fig, ax = plt.subplots(figsize=(15,8))
sns.heatmap(data.corr(),
            xticklabels=data.columns,
            yticklabels=data.columns)
In [ ]:
data[]
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Booking data

In [ ]:
booking_data = pd.read_csv('goa/bookings.csv',
                           dtype={'vhid':'object'},
                           parse_dates=['bookingTime','checkin','checkout'])

# booking_data['bookingDate'] = booking_data['bookingTime'].dt.date
booking_data = booking_data[booking_data['cancelled_bookings']<=0] # filter out cancelled bookings
booking_data = dropColumnsWithSameValue(booking_data)
booking_data['asp'] = booking_data['gmv'] / booking_data['confirmedBookings']
booking_data.drop(columns=['gmv'], inplace=True)

print(booking_data.shape)
booking_data.head()

# take ingo transaction for domestic

Details data

In [ ]:
# path = r'/Users/saman.tamkeen/Desktop/goa/details/'
# all_files = glob.glob(path + "/*.csv")
# # all_files
# li = []

# for i, filename in enumerate(all_files):
#     d = pd.read_csv(filename,
#                     names=['detailTime','detailId','vcid','flavour','checkin','checkout','pax','userId','email','trackingId','vhid'],
#                     dtype={'vcid':'object','vhid':'object','userId':'object'},
#                     parse_dates=['detailTime','checkin','checkout'])
#     d = d[(d['detailTime'] >= startDate) & (d['detailTime'] <= endDate)]
#     if d.shape[0] > 0:
#         li.append(d)
#     print(i)
    
# details_data = pd.concat(li, axis=0, ignore_index=True)
# print(details_data.shape)
In [ ]:
details_data = pd.read_csv('goa/details.csv',
                           dtype={'vhid':'object'},
                           parse_dates=['detailTime','checkin','checkout'])
print(details_data.shape)
details_data.head()

Detail to bookings

In [ ]:
detail2booking = details_data.merge(booking_data, how='left', on=['trackingId','vhid'])
detail2booking.shape
In [ ]:
detail2booking = detail2booking[(pd.isna(detail2booking['bookingTime'])) |
                                ((detail2booking['bookingTime'] > detail2booking['detailTime']) &
                                 ((detail2booking['bookingTime'] - detail2booking['detailTime']).dt.total_seconds() <= 60*60))]
detail2booking.shape
In [ ]:
detail2booking['rank'] = detail2booking.groupby("bookingId")["detailTime"].rank(ascending=False, method='dense')
detail2booking = detail2booking[(pd.isna(detail2booking['bookingTime'])) | (detail2booking['rank'] == 1.)]
detail2booking.drop(columns=['flavour_y','rank'], inplace=True)
detail2booking.rename(columns={'flavour_x':'flavour',
                               'checkin_x':'checkinDetail',
                               'checkout_x':'checkoutDetail',
                               'checkin_y':'checkinBooking',
                               'checkout_y':'checkoutBooking'}, inplace=True)
detail2booking.shape
In [ ]:
detail2booking.drop_duplicates(subset=['detailId'], inplace=True)
detail2booking.shape
In [ ]:
before_memory = memory_usage(detail2booking)

# optimize storage
detail2booking['flavour'] = detail2booking['flavour'].astype('category')
# detail2booking['checkinDetail'] = detail2booking['checkinDetail'].dt.date
# detail2booking['checkoutDetail'] = detail2booking['checkoutDetail'].dt.date
detail2booking['vhid'] = detail2booking['vhid'].astype('int64')
# detail2booking['checkinBooking'] = detail2booking['checkinBooking'].dt.date
# detail2booking['checkoutBooking'] = detail2booking['checkoutBooking'].dt.date
detail2booking['rooms'] = detail2booking['rooms'].fillna(0).astype('int8')
detail2booking['adults'] = detail2booking['adults'].fillna(0).astype('int8')
detail2booking['children'] = detail2booking['children'].fillna(0).astype('int8')
detail2booking['status'] = detail2booking['status'].astype('category')
detail2booking['paymode'] = detail2booking['paymode'].fillna(0).astype('int8')
detail2booking['confirmedBookings'] = detail2booking['confirmedBookings'].fillna(0).astype('int8')
detail2booking['asp']= detail2booking['asp'].astype('float32')

after_memory = memory_usage(detail2booking)

print((before_memory - after_memory)/before_memory)
In [ ]:
d2b = detail2booking[detail2booking['detailTime']>pd.to_datetime('2020-01-01 00:00:00+05:30')]
d2b.shape
In [ ]:
d2b.to_csv('goa/detail2booking.csv', index=False)
In [ ]:
detail2booking['asp'].round(2).max()
In [ ]:
d2b.head()
In [ ]:
 

Detail to booking

In [ ]:
d2b = pd.read_csv('goa/detail2booking.csv',
                  parse_dates=['detailTime','bookingTime','checkinDetail','checkoutDetail','checkinBooking','checkoutBooking'],
                  dtype={'flavour':'category','vhid':'int64','rooms':'int8','adults':'int8','children':'int8',
                         'status':'category','paymode':'int8','confirmedBookings':'int8','asp':'float32'})
d2b.shape
In [ ]:
d2b.dtypes
In [ ]:
# d2b['checkinDetail'] = d2b['checkinDetail'].dt.strftime('%Y%m%d').astype('int32')
# d2b['checkoutDetail'] = d2b['checkoutDetail'].dt.strftime('%Y%m%d').astype('int32')
# d2b['checkinBooking'] = pd.to_numeric(d2b['checkinBooking'].dt.strftime('%Y%m%d'), errors='coerce', downcast='integer').astype('int32')
# d2b['checkoutBooking'] = pd.to_numeric(d2b['checkoutBooking'].dt.strftime('%Y%m%d'), errors='coerce', downcast='integer').fillna(0).astype('int32')
# d2b['checkoutBooking'] = d2b['checkoutBooking'].dt.strftime('%Y%m%d', errors='coerce').astype('int32')
d2b['checkinBooking'] = d2b['checkinBooking'].fillna(0).astype('int32')
d2b.dtypes
In [ ]:
d2b['checkinBooking'] = d2b['checkinBooking'].replace(0., np.NaN)
d2b['checkoutBooking'] = d2b['checkoutBooking'].replace(0., np.NaN)
In [ ]:
d2b.to_csv('goa/detail2booking.csv', index=False)

Searches

In [ ]:
pd.read_csv('/Users/saman.tamkeen/Desktop/goa/searches/part-00032-tid-361649420680128099-70bce0fc-603c-4aa9-9c5f-854b2a85f7db-3732-1-c000.csv')

XGBoost

In [ ]:
# read in data
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
preds

Rough

In [ ]:
# uniqueBookings = detail2booking.merge(uniqueBookings, on=['bookingId','detailTime']).drop_duplicates(subset=['detailId'], keep='first')
# uniqueBookings.shape
d = detail2booking.drop_duplicates(subset='detailId',keep='first')
d.to_csv('goa/details_deduped.csv', index=False)
In [ ]:
# d.head()
d.drop(columns=['vcid','userId','email']).to_csv('goa/details_deduped.csv', index=False)
In [ ]:
uniqueBookings[uniqueBookings['bookingId']=='HTL24LQ54Q']
In [ ]:
detail2booking.groupby('detailId').agg({'detailTime':'count'})
In [ ]:
d = details_data.drop_duplicates()
print(d.shape)
d.to_csv('goa/details_deduped.csv', index=False)
In [ ]:
booking_data.groupby('bookingId').agg({'bookingTime':'count'})
In [ ]:
detail2booking.head()['detailTime'].dt.floor('H')
In [ ]:
detail2booking.head()['detailTime']
In [ ]:
detail2booking.bookingId.unique()
In [100]:
import json
def json_parser(data):
    try:
        return json.loads(data)
    except Exception as ex:
        return None
In [ ]:
df = pd.read_csv('f1.csv', 
                 names=['name','dob','stats'],
                 converters={'stats':CustomParser})
df
In [ ]:
import json
json.loads('{"starRatings":[],"locationIds":[],"locationNames":[],"amenities":[],"propertyTypes":[],"tags":[],"chains":[],"priceRange":[],"offerText":[],"locationSearchForwardParams":[],"gosuggestSearchParamContainer":[{"Vhid":"660156700734546743","searchType":1}],"locationSearchParamContainer":[],"latLongSearchParamContainer":[],"UGCRange":[],"HappySubContextList":[]}')
In [ ]:
s = pd.read_csv('/Users/saman.tamkeen/Desktop/searchSample.csv',
                names=['vcid','flavour','checkIn','checkOut','pax','userId','email','trackingId','sort','filters',
                       'filterInput','algo','sessionId','searchTime','hotels'],
                delimiter='|', escapechar='\\', error_bad_lines=False,
                parse_dates=['searchTime'],
                converters={'filters':json_parser,'filterInput':json_parser,'hotels':json_parser},
                dtype={'flavour':'category','checkIn':'int32','checkout':'int32','sort':'category','algo':'category'})
s.drop(columns=['vcid','userId','email','sessionId'],inplace=True)
s.head()
In [ ]:
s = pd.read_csv('/Users/saman.tamkeen/Desktop/goa/session_data/part-00080-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8058-1-c000.csv', 
            delimiter='|', escapechar='\\',
            parse_dates=['searchTime','detailTime','bookingTime','checkIn','checkOut'],
            converters={'filters':json_parser})

s['los'] = (s['checkOut'] - s['checkIn']).dt.days
s['ap'] = (s['checkIn'] - pd.to_datetime(s['searchTime'].dt.date)).dt.days
s['pax_bucket'] = np.select([(s['pax'] == '1-1_0'),(s['pax'] == '1-2_0')], ['solo', 'couple'], default='other')
s['is_relevant'] = s['detailTime'].notna()
s = s[['flavour','logged_in','los','ap','pax_bucket','searchVhid','is_relevant']]
In [ ]:
def get_filter_value(r):
    filterValue = {}
    for f in r.filters_new:
        try:
            if f == 'gosuggestSearchParamContainer':
                filterValue[f] = [k['Vhid'] for k in r.filterInput[f] if 'Vhid' in k]
            elif f == 'locationSearchForwardParams':
                filterValue[f] = [k['locationID'] if k['idType'] != 'ggl_id' else k['name'] for k in r.filterInput[f]]
            elif f == 'locationSearchParamContainer': # contains google/voyager code
                filterValue[f] = [k['locationId'] for k in r.filterInput[f] if 'locationId' in k]
            else:
                filterValue[f] = r.filterInput[f]
        except:
            print(f, r.filterInput[f])
            filterValue[f] = r.filterInput[f]
    return filterValue
In [ ]:
# s = s[['filters','filterInput']]
s['filterValue'] = s.apply(get_filter_value, axis=1)
s.head()
In [ ]:
# filterInput:struct
#     starRatings:array
#         element:float
#     locationIds:array
#         element:string
#     locationNames:array
#         element:string
#     amenities:array
#         element:string
#     propertyTypes:array
#         element:string
#     tags:array
#         element:string
#     chains:array
#         element:string
#     hotelPriceBucket:float
#     priceRange:array
#         element:float
#     hotelName:string
#     isSlot:float
#     payMode:float
#     offerText:array
#         element:string
#     isFreeCancellation:float
#     isReserveNowPayLater:float
#     isGoBiz:float
#     isPah:float
#     isCoupleFriendly:float
#     isBookedByContacts:float
#     isEarlyCheckin:float
#     PolygonId:string
#     locationSearchForwardParams:array
#         element:struct
#             locationID:string
#             latitude:float
#             longitude:float
#             radius:float
#             idType:string
#             name:string
#     gosuggestSearchParamContainer:array
#         element:struct
#             Vhid:string
#             searchType:long
#     locationSearchParamContainer:array
#         element:struct
#             locationId:string
#             idType:string
#     latLongSearchParamContainer:array
#         element:struct
#             latitude:string
#             longitude:string
#     UGCRange:array
#         element:float
#     HappyContext:string
#     HappySubContextList:array
#         element:string
#     HappyContextType:string
s[(s['filters'].str[0] == 'gosuggestSearchParamContainer') & (s['filterValue'].str.contains('Vhid'))]
In [ ]:
def get_filters(filterInput):
    if filterInput is None:
        return []
    return [k for k,v in filterInput.items() if not(v is None or (type(v) == list and (len(v) == 0 or sum([len(e) if type(e) == dict else 1 for e in v]) == 0)))]
In [ ]:
# s['filters_new'] = s['filterInput'].apply(get_filters)
s[s['filters'] != s['filters_new']]
In [103]:
path = r'/Users/saman.tamkeen/Desktop/goa/session_data/'
all_files = glob.glob(path + "/*.csv")
li = []
errors = []

for i, filename in enumerate(all_files):
    try:
        s = pd.read_csv(filename, 
                    delimiter='|', escapechar='\\',
                    parse_dates=['searchTime','detailTime','bookingTime','checkIn','checkOut'])

        s['los'] = (s['checkOut'] - s['checkIn']).dt.days
        s['searchDate'] = pd.to_datetime(s['searchTime'].dt.date)
        s['ap'] = (s['checkIn'] - s['searchDate']).dt.days
        s['pax_bucket'] = np.select([(s['pax'] == '1-1_0'),(s['pax'] == '1-2_0')], ['solo', 'couple'], default='other')
        s['is_relevant'] = s['detailTime'].notna()
        s = s[['searchDate','flavour','los','ap','pax_bucket','searchVhid','is_relevant']]
        if s.shape[0] > 0:
            li.append(s)
        print(i, filename.split('/')[-1])
    except:
        errors.append(filename)
    
session_data = pd.concat(li, axis=0, ignore_index=True)
print(errors)
print(session_data.shape)
0 part-00158-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8114-1-c000.csv
1 part-00080-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8058-1-c000.csv
2 part-00148-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8098-1-c000.csv
3 part-00121-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8080-1-c000.csv
4 part-00112-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8092-1-c000.csv
5 part-00042-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8007-1-c000.csv
6 part-00040-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8006-1-c000.csv
7 part-00110-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8070-1-c000.csv
8 part-00168-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8130-1-c000.csv
9 part-00036-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7999-1-c000.csv
10 part-00161-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8138-1-c000.csv
11 part-00029-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7997-1-c000.csv
12 part-00109-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8082-1-c000.csv
13 part-00174-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8141-1-c000.csv
14 part-00084-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8046-1-c000.csv
15 part-00129-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8099-1-c000.csv
16 part-00132-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8095-1-c000.csv
17 part-00096-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8064-1-c000.csv
18 part-00069-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8040-1-c000.csv
19 part-00139-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8115-1-c000.csv
20 part-00061-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8036-1-c000.csv
21 part-00160-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8135-1-c000.csv
22 part-00138-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8109-1-c000.csv
23 part-00187-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8149-1-c000.csv
24 part-00095-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8065-1-c000.csv
25 part-00007-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7978-1-c000.csv
26 part-00105-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8066-1-c000.csv
27 part-00173-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8153-1-c000.csv
28 part-00179-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8146-1-c000.csv
29 part-00072-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8032-1-c000.csv
30 part-00116-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8081-1-c000.csv
31 part-00026-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7991-1-c000.csv
32 part-00104-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8085-1-c000.csv
33 part-00122-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8105-1-c000.csv
34 part-00183-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8152-1-c000.csv
35 part-00159-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8137-1-c000.csv
36 part-00092-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8054-1-c000.csv
37 part-00191-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8156-1-c000.csv
38 part-00044-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8027-1-c000.csv
39 part-00054-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8015-1-c000.csv
40 part-00049-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8031-1-c000.csv
41 part-00190-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8161-1-c000.csv
42 part-00041-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8025-1-c000.csv
43 part-00091-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8062-1-c000.csv
44 part-00066-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8030-1-c000.csv
45 part-00097-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8061-1-c000.csv
46 part-00136-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8107-1-c000.csv
47 part-00035-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8014-1-c000.csv
48 part-00151-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8119-1-c000.csv
49 part-00145-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8097-1-c000.csv
50 part-00163-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8117-1-c000.csv
51 part-00081-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8041-1-c000.csv
52 part-00113-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8074-1-c000.csv
53 part-00074-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8039-1-c000.csv
54 part-00166-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8140-1-c000.csv
55 part-00012-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7994-1-c000.csv
56 part-00005-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7971-1-c000.csv
57 part-00038-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8002-1-c000.csv
58 part-00071-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8045-1-c000.csv
59 part-00048-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8017-1-c000.csv
60 part-00016-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7996-1-c000.csv
61 part-00063-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8023-1-c000.csv
62 part-00082-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8047-1-c000.csv
63 part-00131-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8125-1-c000.csv
64 part-00030-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8011-1-c000.csv
65 part-00153-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8134-1-c000.csv
66 part-00093-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8069-1-c000.csv
67 part-00062-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8038-1-c000.csv
68 part-00024-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7983-1-c000.csv
69 part-00111-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8072-1-c000.csv
70 part-00019-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7979-1-c000.csv
71 part-00003-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7969-1-c000.csv
72 part-00070-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8042-1-c000.csv
73 part-00169-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8150-1-c000.csv
74 part-00123-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8083-1-c000.csv
75 part-00144-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8123-1-c000.csv
76 part-00021-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7990-1-c000.csv
77 part-00128-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8087-1-c000.csv
78 part-00149-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8102-1-c000.csv
79 part-00098-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8071-1-c000.csv
80 part-00141-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8116-1-c000.csv
81 part-00181-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8154-1-c000.csv
82 part-00050-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8009-1-c000.csv
83 part-00124-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8094-1-c000.csv
84 part-00172-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8136-1-c000.csv
85 part-00118-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8104-1-c000.csv
86 part-00130-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8093-1-c000.csv
87 part-00056-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8016-1-c000.csv
88 part-00020-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7998-1-c000.csv
89 part-00155-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8132-1-c000.csv
90 part-00125-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8110-1-c000.csv
91 part-00022-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8010-1-c000.csv
92 part-00088-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8060-1-c000.csv
93 part-00004-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7973-1-c000.csv
94 part-00076-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8044-1-c000.csv
95 part-00177-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8147-1-c000.csv
96 part-00152-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8131-1-c000.csv
97 part-00014-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7989-1-c000.csv
98 part-00043-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8000-1-c000.csv
99 part-00068-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8034-1-c000.csv
100 part-00033-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7992-1-c000.csv
101 part-00195-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8162-1-c000.csv
102 part-00060-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8019-1-c000.csv
103 part-00058-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8035-1-c000.csv
104 part-00178-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8142-1-c000.csv
105 part-00009-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7977-1-c000.csv
106 part-00052-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8020-1-c000.csv
107 part-00027-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7995-1-c000.csv
108 part-00057-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8022-1-c000.csv
109 part-00037-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8005-1-c000.csv
110 part-00196-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8163-1-c000.csv
111 part-00186-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8151-1-c000.csv
112 part-00199-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8168-1-c000.csv
113 part-00114-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8084-1-c000.csv
114 part-00000-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7972-1-c000.csv
115 part-00180-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8143-1-c000.csv
116 part-00090-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8059-1-c000.csv
117 part-00115-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8076-1-c000.csv
118 part-00175-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8145-1-c000.csv
119 part-00117-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8101-1-c000.csv
120 part-00154-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8106-1-c000.csv
121 part-00107-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8091-1-c000.csv
122 part-00189-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8157-1-c000.csv
123 part-00171-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8139-1-c000.csv
124 part-00164-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8122-1-c000.csv
125 part-00023-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7986-1-c000.csv
126 part-00011-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7988-1-c000.csv
127 part-00176-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8144-1-c000.csv
128 part-00185-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8158-1-c000.csv
129 part-00006-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7974-1-c000.csv
130 part-00188-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8165-1-c000.csv
131 part-00100-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8068-1-c000.csv
132 part-00002-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7976-1-c000.csv
133 part-00142-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8090-1-c000.csv
134 part-00075-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8043-1-c000.csv
135 part-00018-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7985-1-c000.csv
136 part-00047-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8018-1-c000.csv
137 part-00127-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8120-1-c000.csv
138 part-00065-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8024-1-c000.csv
139 part-00032-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8013-1-c000.csv
140 part-00106-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8088-1-c000.csv
141 part-00134-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8086-1-c000.csv
142 part-00083-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8049-1-c000.csv
143 part-00103-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8078-1-c000.csv
144 part-00073-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8051-1-c000.csv
145 part-00197-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8166-1-c000.csv
146 part-00077-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8053-1-c000.csv
147 part-00157-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8124-1-c000.csv
148 part-00045-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8008-1-c000.csv
149 part-00064-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8029-1-c000.csv
150 part-00053-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8012-1-c000.csv
151 part-00192-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8164-1-c000.csv
152 part-00184-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8155-1-c000.csv
153 part-00028-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7987-1-c000.csv
154 part-00013-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7981-1-c000.csv
155 part-00156-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8113-1-c000.csv
156 part-00194-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8160-1-c000.csv
157 part-00055-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8021-1-c000.csv
158 part-00099-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8075-1-c000.csv
159 part-00017-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7984-1-c000.csv
160 part-00094-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8063-1-c000.csv
161 part-00147-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8111-1-c000.csv
/Users/saman.tamkeen/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning:

Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.

162 part-00067-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8033-1-c000.csv
163 part-00086-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8050-1-c000.csv
164 part-00120-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8079-1-c000.csv
165 part-00108-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8067-1-c000.csv
166 part-00001-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7970-1-c000.csv
167 part-00198-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8167-1-c000.csv
168 part-00078-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8052-1-c000.csv
169 part-00101-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8073-1-c000.csv
170 part-00010-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7975-1-c000.csv
171 part-00089-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8056-1-c000.csv
172 part-00059-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8026-1-c000.csv
173 part-00193-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8159-1-c000.csv
174 part-00140-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8103-1-c000.csv
175 part-00046-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8028-1-c000.csv
176 part-00008-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7982-1-c000.csv
177 part-00137-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8127-1-c000.csv
178 part-00102-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8077-1-c000.csv
179 part-00079-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8057-1-c000.csv
180 part-00162-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8126-1-c000.csv
181 part-00051-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8037-1-c000.csv
182 part-00039-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8004-1-c000.csv
183 part-00150-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8112-1-c000.csv
184 part-00031-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8001-1-c000.csv
185 part-00015-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7980-1-c000.csv
186 part-00133-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8096-1-c000.csv
187 part-00087-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8055-1-c000.csv
188 part-00119-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8089-1-c000.csv
189 part-00025-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-7993-1-c000.csv
190 part-00143-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8121-1-c000.csv
191 part-00170-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8133-1-c000.csv
192 part-00182-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8148-1-c000.csv
193 part-00165-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8128-1-c000.csv
194 part-00034-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8003-1-c000.csv
195 part-00085-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8048-1-c000.csv
196 part-00135-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8100-1-c000.csv
197 part-00146-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8108-1-c000.csv
198 part-00167-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8129-1-c000.csv
199 part-00126-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8118-1-c000.csv
[]
(42566137, 7)
In [104]:
session_data.head()
Out[104]:
searchDate flavour los ap pax_bucket searchVhid is_relevant
0 2020-01-01 android 3 14 other 1646863333716315870 False
1 2020-01-01 android 3 14 other 1380978004115375903 False
2 2020-01-01 android 3 14 other 2777571699355626229 False
3 2020-01-01 android 3 14 other 7313545420952717620 False
4 2020-01-01 android 3 14 other 5489734261718422793 False
In [105]:
session_data.to_csv('goa/session_data.csv', index=False)
In [ ]:
pd.read_csv('/Users/saman.tamkeen/Desktop/goa/daily_funnel_data/part-00160-tid-8719775214049067280-83283836-dd94-488e-9fb0-686ff67620e3-7769-1-c000.csv',delimiter='|')
In [ ]:
search_data_2 = pd.concat([search_data,search_data_2], axis=0, ignore_index=True)
print(search_data_2.shape)
In [ ]:
path = r'/Users/saman.tamkeen/Desktop/goa/searches/'
all_files = glob.glob(path + "/*.csv")
li = []

for i, filename in enumerate(all_files):
    print(i, filename.split('/')[-1])
In [ ]:
os.remove('/Users/saman.tamkeen/Desktop/nohup.out')
In [ ]:
from itertools import zip_longest
a = [1,2,3]
b = ['a','b','c']
c = []
[(_1,_2,_3) for _1,_2,_3 in zip_longest(a,b,c)]
In [ ]: